Data Read In

CSLAP<-read.csv("CSLAP_Dataset_09232019.csv", header=TRUE)
CSLAP$Sample_Year<-as.factor(CSLAP$Sample_Year)
CSLAP$Sample_Month<-as.factor(CSLAP$Sample_Month)

#Split by `Info_Type`
OWCSLAP<-CSLAP[CSLAP$Info_Type == "OW",]
BSCSLAP<-CSLAP[CSLAP$Info_Type == "BS",]
SBCSLAP<-CSLAP[CSLAP$Info_Type == "SB",]
noSBCSLAP<-CSLAP[CSLAP$Info_Type != "SB",]

redCSLAP<-read.csv("redCSLAP.csv")

redOWCSLAP<-redCSLAP[redCSLAP$Info_Type == "OW",]
redBSCSLAP<-redCSLAP[redCSLAP$Info_Type == "BS",]
redSBCSLAP<-redCSLAP[redCSLAP$Info_Type == "SB",]
rednoSBCSLAP<-redCSLAP[redCSLAP$Info_Type != "SB",]

Mixed Effects Models for the Global Dataset (n=69 lakes)

Open Water Total Phosphorus

getwd()
## [1] "/Users/victoriafield/Library/Mobile Documents/com~apple~CloudDocs/Masters Thesis/Chapter Two CSLAP Manuscript/R - data and outputs"
#Select out data of interest
df <- OWCSLAP %>% dplyr::select(TP_mg.L, Dreissenids, CA.SA, Mean_Depth_m, Sample_Year, Sample_Month, LakeID)
df <- df[complete.cases(df), ]

#Check distribution 
##gamma
gamma <- fitdistr(df$TP_mg.L+0.01, "gamma")
qqp(df$TP_mg.L+0.01, "gamma", shape = gamma$estimate[[1]], rate = gamma$estimate[[2]])

## [1] 2665 2659
##log-normal 
qqp(df$TP_mg.L, "lnorm")

## [1] 2665 2659

Model 1 - Full Model, TP log transformed

fit1<-lmer(log(TP_mg.L+.01) ~ Dreissenids + log(CA.SA) + Mean_Depth_m + (1|Sample_Year) + (1|Sample_Month) + (1|LakeID) + (1|Sample_Year:LakeID), data=df)

summary(fit1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: log(TP_mg.L + 0.01) ~ Dreissenids + log(CA.SA) + Mean_Depth_m +  
##     (1 | Sample_Year) + (1 | Sample_Month) + (1 | LakeID) + (1 |  
##     Sample_Year:LakeID)
##    Data: df
## 
## REML criterion at convergence: -756.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.6502 -0.4731 -0.0937  0.3083  9.7084 
## 
## Random effects:
##  Groups             Name        Variance  Std.Dev.
##  Sample_Year:LakeID (Intercept) 0.0175696 0.13255 
##  LakeID             (Intercept) 0.0256021 0.16001 
##  Sample_Month       (Intercept) 0.0001905 0.01380 
##  Sample_Year        (Intercept) 0.0022388 0.04732 
##  Residual                       0.0338332 0.18394 
## Number of obs: 2702, groups:  
## Sample_Year:LakeID, 349; LakeID, 69; Sample_Month, 6; Sample_Year, 6
## 
## Fixed effects:
##                        Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)           -3.721277   0.082613  86.712111 -45.045  < 2e-16 ***
## DreissenidsUninvaded  -0.049672   0.059274 111.130287  -0.838 0.403825    
## log(CA.SA)             0.018196   0.021658  65.195196   0.840 0.403900    
## Mean_Depth_m          -0.022151   0.005685  66.781005  -3.896 0.000229 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) DrssnU l(CA.S
## DrssndsUnnv -0.678              
## log(CA.SA)  -0.545  0.058       
## Mean_Dpth_m -0.294  0.014 -0.206
plot(fit1,col=df$LakeID) #plots model residuals grouped by lake

plot(fit1,col=df$LakeID, id=0.5, idLabels=~.obs, cex=.5) #plots model residuals with labels (observation number) for values outside Normal confidence limits 

plot(fit1,col=df$Sample_Year) #plots model residuals grouped by year 

plot(fit1,col=df$Sample_Year, id=0.5, idLabels=~.obs, cex=.5) #plots model residuals with labels (observation number) for values outside Normal confidence limits 

qqnorm(resid(fit1))
qqline(resid(fit1))

From observation numbers, we can see that those points on the far right of each plot correspond to the following lakes and years: Guilford Lake, East Caroga, Little Long Pond, and Galway Lake. And Year 2015 (blue), Year 2012 (black), Year 2017 (Pink), and Year 2016 (Cyan)

Model 2 - removing “troublesome” lakes

#Select out data of interest
df2 <- df[df$LakeID != "0601GUI0188",]
df2 <- df2[df2$LakeID != "1201ECA0697", ]
df2 <- df2[df2$LakeID != "1701LLO0708", ]
df2 <- df2[df2$LakeID != "1201GAL0563", ]

fit2<-lmer(log(TP_mg.L+.01) ~ Dreissenids + log(CA.SA) + Mean_Depth_m + (1|Sample_Year) + (1|Sample_Month) + (1|LakeID) + (1|Sample_Year:LakeID), data=df2)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.0170704 (tol = 0.002, component 1)
summary(fit2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: log(TP_mg.L + 0.01) ~ Dreissenids + log(CA.SA) + Mean_Depth_m +  
##     (1 | Sample_Year) + (1 | Sample_Month) + (1 | LakeID) + (1 |  
##     Sample_Year:LakeID)
##    Data: df2
## 
## REML criterion at convergence: -1234.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.0533 -0.4788 -0.1202  0.3204  7.7073 
## 
## Random effects:
##  Groups             Name        Variance  Std.Dev.
##  Sample_Year:LakeID (Intercept) 0.0070795 0.08414 
##  LakeID             (Intercept) 0.0232003 0.15232 
##  Sample_Month       (Intercept) 0.0002114 0.01454 
##  Sample_Year        (Intercept) 0.0011476 0.03388 
##  Residual                       0.0289562 0.17017 
## Number of obs: 2515, groups:  
## Sample_Year:LakeID, 325; LakeID, 65; Sample_Month, 6; Sample_Year, 6
## 
## Fixed effects:
##                        Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)           -3.735775   0.073489  88.292693 -50.834  < 2e-16 ***
## DreissenidsUninvaded  -0.053305   0.050551 136.143312  -1.054 0.293530    
## log(CA.SA)             0.017758   0.020410  61.089340   0.870 0.387659    
## Mean_Depth_m          -0.021598   0.005322  62.033773  -4.058 0.000141 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) DrssnU l(CA.S
## DrssndsUnnv -0.644              
## log(CA.SA)  -0.571  0.054       
## Mean_Dpth_m -0.304  0.006 -0.213
## convergence code: 0
## Model failed to converge with max|grad| = 0.0170704 (tol = 0.002, component 1)
plot(fit2,col=df2$LakeID) #plots model residuals grouped by lake

plot(fit2,col=df2$LakeID, id=0.5, idLabels=~.obs, cex=.5) #plots model residuals with labels (observation number) for values outside Normal confidence limits 

plot(fit2,col=df2$Sample_Year) #plots model residuals grouped by year 

plot(fit2,col=df2$Sample_Year, id=0.5, idLabels=~.obs, cex=.5) #plots model residuals with labels (observation number) for values outside Normal confidence limits 

qqnorm(resid(fit2))
qqline(resid(fit2))

Removing lakes improves residual plot, but not QQ plot

Model 3 - removing “troublesome” Year 2015

#Select out data of interest
df3 <- df[df$Sample_Year != "2015",]


fit3<-lmer(log(TP_mg.L+.01) ~ Dreissenids + log(CA.SA) + Mean_Depth_m + (1|Sample_Year) + (1|Sample_Month) + (1|LakeID) + (1|Sample_Year:LakeID), data=df3)

summary(fit3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: log(TP_mg.L + 0.01) ~ Dreissenids + log(CA.SA) + Mean_Depth_m +  
##     (1 | Sample_Year) + (1 | Sample_Month) + (1 | LakeID) + (1 |  
##     Sample_Year:LakeID)
##    Data: df3
## 
## REML criterion at convergence: -1128.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.0512 -0.4997 -0.1188  0.3060 10.4227 
## 
## Random effects:
##  Groups             Name        Variance  Std.Dev.
##  Sample_Year:LakeID (Intercept) 5.260e-03 0.072526
##  LakeID             (Intercept) 2.742e-02 0.165578
##  Sample_Month       (Intercept) 2.429e-05 0.004929
##  Sample_Year        (Intercept) 1.690e-03 0.041110
##  Residual                       2.903e-02 0.170379
## Number of obs: 2272, groups:  
## Sample_Year:LakeID, 292; LakeID, 69; Sample_Month, 6; Sample_Year, 5
## 
## Fixed effects:
##                        Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)           -3.764171   0.077047  96.364793 -48.855  < 2e-16 ***
## DreissenidsUninvaded  -0.049569   0.051430 169.912048  -0.964 0.336509    
## log(CA.SA)             0.023500   0.021539  65.718248   1.091 0.279229    
## Mean_Depth_m          -0.019589   0.005633  66.493563  -3.477 0.000898 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) DrssnU l(CA.S
## DrssndsUnnv -0.630              
## log(CA.SA)  -0.571  0.048       
## Mean_Dpth_m -0.311  0.010 -0.204
plot(fit3,col=df3$LakeID) #plots model residuals grouped by lake

plot(fit3,col=df3$LakeID, id=0.5, idLabels=~.obs, cex=.5) #plots model residuals with labels (observation number) for values outside Normal confidence limits 

plot(fit3,col=df3$Sample_Year) #plots model residuals grouped by year 

plot(fit3,col=df3$Sample_Year, id=0.5, idLabels=~.obs, cex=.5) #plots model residuals with labels (observation number) for values outside Normal confidence limits 

qqnorm(resid(fit3))
qqline(resid(fit3))

Removing 2015 improves neither residuals nor QQ

Model 3 - removing “troublesome” lakes AND 2015

#Select out data of interest
df4 <- df[df$LakeID != "0601GUI0188",]
df4 <- df4[df4$LakeID != "1201ECA0697", ]
df4 <- df4[df4$LakeID != "1701LLO0708", ]
df4 <- df4[df4$LakeID != "1201GAL0563", ]
df4 <- df4[df4$Sample_Year != "2015",]

fit4<-lmer(log(TP_mg.L+.01) ~ Dreissenids + log(CA.SA) + Mean_Depth_m + (1|Sample_Year) + (1|Sample_Month) + (1|LakeID) + (1|Sample_Year:LakeID), data=df4)

summary(fit4)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: log(TP_mg.L + 0.01) ~ Dreissenids + log(CA.SA) + Mean_Depth_m +  
##     (1 | Sample_Year) + (1 | Sample_Month) + (1 | LakeID) + (1 |  
##     Sample_Year:LakeID)
##    Data: df4
## 
## REML criterion at convergence: -1376.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.4509 -0.5243 -0.1206  0.3339  8.2501 
## 
## Random effects:
##  Groups             Name        Variance  Std.Dev.
##  Sample_Year:LakeID (Intercept) 3.534e-03 0.059448
##  LakeID             (Intercept) 2.470e-02 0.157154
##  Sample_Month       (Intercept) 1.454e-05 0.003813
##  Sample_Year        (Intercept) 1.275e-03 0.035706
##  Residual                       2.515e-02 0.158588
## Number of obs: 2114, groups:  
## Sample_Year:LakeID, 272; LakeID, 65; Sample_Month, 6; Sample_Year, 5
## 
## Fixed effects:
##                        Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)           -3.767159   0.072209  91.587601 -52.170  < 2e-16 ***
## DreissenidsUninvaded  -0.048822   0.046906 174.969528  -1.041 0.299381    
## log(CA.SA)             0.017846   0.020804  61.360106   0.858 0.394355    
## Mean_Depth_m          -0.018813   0.005418  61.964622  -3.472 0.000947 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) DrssnU l(CA.S
## DrssndsUnnv -0.606              
## log(CA.SA)  -0.584  0.046       
## Mean_Dpth_m -0.315  0.004 -0.213
plot(fit4,col=df4$LakeID) #plots model residuals grouped by lake

plot(fit4,col=df4$LakeID, id=0.5, idLabels=~.obs, cex=.5) #plots model residuals with labels (observation number) for values outside Normal confidence limits 

plot(fit4,col=df4$Sample_Year) #plots model residuals grouped by year 

plot(fit4,col=df4$Sample_Year, id=0.5, idLabels=~.obs, cex=.5) #plots model residuals with labels (observation number) for values outside Normal confidence limits 

qqnorm(resid(fit4))
qqline(resid(fit4))

Removing “bad” lakes and Year 2015 improves residual plot the most, but the QQ line is still wonky

Alternate distributions using Generalized Linear Mixed Effects Model